None
DATASET: Local outlet chain data including
QUESTIONS
Set up the environment & Read the datasets
# # ACCESS GOOGLE DRIVE
# from google.colab import drive
# drive.mount('/content/drive')
# # Change Directory
# import os
# dir = '/content/drive/MyDrive/Data Analyst Projects/Outlet/Datasets'
# if os.getcwd() != dir:
# os.chdir(dir)
# print(f'changed directory to {dir}')
# else:
# print(f'already in {dir}')
# import libraries
import numpy as np
import pandas as pd
from sklearn import cluster
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Solve the blank html problem
# import plotly.io as pio
# pio.renderers.default = "notebook"
# Take a look at the files
# os.chdir('Datasets')
# ! ls
membership = pd.read_csv('Membership.csv')
cards = pd.read_csv('Cards.csv')
rewards = pd.read_csv('Rewards.csv')
transactionMeta = pd.read_csv('TransactionMeta.csv')
businessCategory = pd.read_csv('BusinessCategory.csv', dtype={'Business_ID': str})
zipcodes = pd.read_csv('Zipcodes.csv')
# Take a look at the datasets
display('membership', membership.head(2))
display('cards', cards.head(2))
display('rewards', rewards.head(2))
display('transactionMeta', transactionMeta.head(2))
display('businessCategory', businessCategory.head(2))
# Merge membership data with address
zipcodes['縣市'] = zipcodes['欄位2'].str[:3]
zipcodes['鄉鎮市區'] = zipcodes['欄位2'].str[3:]
zipcodes['City'] = zipcodes['欄位3'].str.split(',').str[1]
zipcodes['Town'] = zipcodes['欄位3'].str.split(',').str[0]
zipcodes.rename(columns={'欄位1': 'Zipcode'}, inplace=True)
zipcodes['Zipcode'] = zipcodes['Zipcode'].astype(str)
zipcodes
membership_loc = membership.merge(zipcodes[['Zipcode', 'City', 'Town']],
how='left',
left_on='Zipcode',
right_on='Zipcode')
# Merge membership with card ID
membership_full = membership_loc.merge(cards,
how = 'inner',
left_on = 'Member_ID',
right_on = 'Member_ID')
# Convert data type & handle missing values
membership_full['Age'] = [None if age == '-' else int(age) for age in membership_full['Age']]
membership_full['Gender'] = [None if sex not in ['F', 'M'] else sex for sex in membership_full['Gender']]
membership_full['Regeistered_date'] = [None if date == '-' else pd.to_datetime(date) for date in membership_full['Regeistered_date']]
membership_full['Regeistered_month'] = membership_full['Regeistered_date'].dt.month
# Clean up the Business_ID
transactionMeta['Business_ID'] = transactionMeta['Business_ID_Date'].str.split('_').str[0]
# Parse datetime
transactionMeta['Date'] = pd.to_datetime(transactionMeta['Date'])
# Join Transactions & Business Categories
transactionWithCat = transactionMeta.merge(
businessCategory[['Business_ID', 'Category']],
how='left',
left_on='Business_ID',
right_on='Business_ID'
)
# This table has no primary key!!! => Group by Date & Card_ID (See the same person on the same day as 1 visit)
transactionByVisit = (transactionMeta
.groupby(['Card_ID','Date'], as_index=False)
.agg({'Amount': 'sum'})
)
# Encoding Campaign Type
def campaignCode(c):
if ('三倍' in c) or ('3倍' in c):
return 'Triple'
elif ('兩倍' in c) or ('2倍' in c):
return 'Double'
elif '會員紅利積點' in c:
return 'Normal'
else:
return c
rewards['Campaign_code'] = rewards['Campaign'].map(lambda c: campaignCode(c))
# Get rid of odd values & translate into English
rewards_clean = rewards[rewards['Campaign_code'].isin(['Normal', 'Double', 'Triple'])]
rewards_clean = rewards_clean.rename(columns={'集點':'points'})
rewards_clean['points'] = rewards_clean['points'].astype(int)
# Calculating Recency, Frequency & Monetary
RFM = (transactionByVisit
.groupby('Card_ID')
.agg(Most_recent=('Date','max'),
Frequency=('Date','size'),
Monetary=('Amount','mean'))
)
# Feature Engineering
RFM['Recency'] = (pd.Timestamp(2020, 1, 1, 0) - RFM['Most_recent']).dt.days
RFM['log(Monetary)'] = np.log(RFM['Monetary'])
RFM['Monetary_Annual'] = RFM['Frequency'] * RFM['Monetary']
# Retrieve The design matrix for model building
RFM_design = RFM[['Recency', 'Frequency', 'log(Monetary)']]
# Standardize RFM values
means = np.mean(RFM_design, axis = 0)
std = np.std(RFM_design, axis = 0)
RFM_standardized = (RFM_design - means) / std
RFM_standardized
# Check the distribution of RFM
def plot_rfm(df, name=''):
fig, ax = plt.subplots(1, 3, figsize=(12,5))
fig.suptitle(f'{name} RFM distribution', fontsize=20)
for i, value in enumerate(['Recency', 'Frequency', 'log(Monetary)']):
ax[i].hist(df[value])
ax[i].set_title(value)
# Plot original RFM
plot_rfm(RFM_design, 'Original')
# Plot standardized RFM
plot_rfm(RFM_standardized, 'Standardized')
We segment members into 7 clusters based on Recency, Frequency & Monetary values, with the help of machine learning model (Spectral clustering). These clusters help us explore the deeper insights into member preferences &behaviors
# Prepare the result matrix
clustered = RFM_standardized.copy()
# Decide the number of cluisters
inertia = []
for i in range(1,10):
kmeans_cluster = KMeans(n_clusters = i).fit(RFM_standardized)
inertia.append(kmeans_cluster.inertia_)
# Plot the result
plt.plot(range(1,10), inertia)
plt.xlabel('# of clusters(k)')
plt.ylabel('Inertia')
plt.title('The best k lies between 4 to 7', fontsize=20);
''' Modeling '''
# K-means Clustering
kmeans_cluster = KMeans(n_clusters = 7, random_state=100).fit(RFM_standardized)
clustered['Cluster_Kmeans'] = [str(label) for label in kmeans_cluster.labels_]
# Agglomerative Clustering
agg_cluster = cluster.AgglomerativeClustering(n_clusters = 7, linkage= 'complete').fit(RFM_standardized)
clustered['Cluster_Agg'] = [str(label) for label in agg_cluster.labels_]
# Spectral Clustering
spectral_cluster = cluster.SpectralClustering(n_clusters=7, affinity="nearest_neighbors", random_state=10).fit(RFM_standardized)
clustered['Cluster_Spectral'] = [str(label) for label in spectral_cluster.labels_]
''' Plotting '''
def plot_cluster(model, name):
fig = px.scatter_3d(clustered, x='Recency', y='Frequency', z='log(Monetary)', color=model, width=600, height=500, title=name)
fig.update_layout(paper_bgcolor="LightSteelBlue") #, margin=dict(l=20, r=20, t=20, b=30)
fig.show()
plot_cluster(model='Cluster_Kmeans', name='K-means Clustering')
print()
plot_cluster(model='Cluster_Agg', name='Agglomerative Clustering')
print()
plot_cluster(model='Cluster_Spectral', name='Spectral Clustering')
We pick Spectral among all 3 models, for
model_name = {'Cluster_Kmeans': 'K-Means clustering',
'Cluster_Agg': 'Agglomerative clustering',
'Cluster_Spectral': 'Spectral clustering',
}
fig, ax = plt.subplots(1, 3, figsize=(12,5))
fig.suptitle('Clustering Models Compared', fontsize=20)
for i, (model, figName) in enumerate(model_name.items()):
ax[i].hist(clustered[model].sort_values(), edgecolor='w')
ax[i].set_title(figName)
ax[i].set_xlabel('cluster')
ax[i].set_ylabel('# of members')
ax[i].label_outer() # hide the inner overlapping ticks & labels
# Join raw RFM statistics with clustering result, and merge back to the membership_full table
RFM_clustered = RFM.join(clustered['Cluster_Spectral']).reset_index()
membership_clustered = membership_full.merge(
RFM_clustered[['Card_ID', 'Recency', 'Frequency', 'Monetary', 'Monetary_Annual', 'Cluster_Spectral']],
how = 'left',
left_on = 'Card_ID',
right_on = 'Card_ID'
)
transactionWithCat_Clustered = transactionWithCat.merge(
RFM_clustered[['Card_ID', 'Cluster_Spectral']],
how='left',
left_on='Card_ID',
right_on='Card_ID'
)
rewards_clustered = rewards_clean.merge(
RFM_clustered[['Card_ID', 'Cluster_Spectral']],
how='left',
left_on='Card_ID',
right_on='Card_ID'
)
''' Slice dfs into clusters '''
# Slice the membership table into clusters
MemberClusters = []
for i in range(7):
MemberClusters.append(membership_clustered[membership_clustered['Cluster_Spectral'] == str(i)])
# Slice the transactions table into clusters
TransClusters = []
for i in range(7):
TransClusters.append(transactionWithCat_Clustered[transactionWithCat_Clustered['Cluster_Spectral'] == str(i)])
# Slice the rewards table into clusters
RewardClusters = []
for i in range(7):
RewardClusters.append(rewards_clustered[rewards_clustered['Cluster_Spectral'] == str(i)])
These clusters can be summarized by labels
With the statistics & visualization below
# Group statistics
groups = (membership_clustered
.groupby('Cluster_Spectral', as_index=False)
.agg(Size=('Card_ID','size'),
Revenue=('Monetary_Annual','sum'),
MidR=('Recency','median'),
MidF=('Frequency','median'),
MidM=('Monetary','median'),
MinR=('Recency','min'),
MaxR=('Recency','max'),
MinF=('Frequency','min'),
MaxF=('Frequency','max'),
MinM=('Monetary','min'),
MaxM=('Monetary','max'))
)
groups['% Members'] = 100 * groups['Size'] / groups['Size'].sum()
groups['% Revenue'] = 100 * groups['Revenue'] / groups['Revenue'].sum()
groups['Customer Value'] = groups['Revenue'] / groups['Size']
# Adding a summarizing label describing each cluster
groups['label'] = ['Quarterly',
'Monthly',
'Most valuable',
'Churned (highest)',
'Semi-annually',
'Churned',
'Most loyal']
groups
fig = px.scatter_3d(groups, x='MidR', y='MidF', z='MidM', color='Cluster_Spectral', width=600, height=500,
title='Visualizing the RFM distribution of each cluster')
fig.update_layout(
# margin=dict(l=20, r=20, t=20, b=30),
paper_bgcolor="LightSteelBlue")
fig.show()
# Plot the proportion of member count & revenue by cluster
labels = [(f"{i}. " + groups['label'][i]) for i in range(7)]
x = np.arange(len(labels))
width = 0.3
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(6,8))
fig.suptitle('Cluster 2 & 6 Are Most Valuable in Terms of Avg Spending', fontsize=20)
ax1.bar(x - width / 2, groups['% Members'], width, label='% Members')
ax1.bar(x + width / 2, groups['% Revenue'], width, label='% Revenue')
ax1.set_ylabel('Proportion', fontsize=15)
ax1.legend()
ax1.grid();
# Plot the Average Annual customer value by cluster
ax2.bar(x, groups['Customer Value'], .3, color='firebrick', label='Avg spending')
ax2.tick_params(labelrotation=30, labelsize=10)
ax2.set_xlabel('Clusters', fontsize=15)
ax2.set_ylabel('Avg Annual Customer Value', fontsize=15)
ax2.legend()
ax2.grid()
plt.xticks(x, labels);
We explore the differences between clusters in terms of geography, age, gender, favored category, favored reward campaign & most popular registered month.
The differences aren't significant given the fairly small member base, but would be an useful information once the business grows.
# Plot the top 5 address (town) of each cluster
Town = []
for i in range(7):
Town.append(MemberClusters[i]['Town'].value_counts().head())
fig, ax = plt.subplots(1, 7, figsize=(20,5))
for i in range(7):
colorlist = ['grey'
if town in ['Neihu Dist.', "Da’an Dist."]
else 'firebrick'
for town in Town[i].index]
ax[i].bar(Town[i].index, Town[i], color = colorlist)
ax[i].tick_params(labelrotation=75)
ax[i].set_title(f"cluster{i}")
ax[i].set(ylabel='# of members')
ax[i].label_outer() # hide the inner overlapping ticks & labels
fig.suptitle("Popular Neighborhoods for Each Cluster aren't quite different (sotre locations in gray)", fontsize=20);
Cluster 6 has more members between age 30-40 & 60-70, while other clusters being almost identical
# Plot the age distribution by cluster
fig, ax = plt.subplots(1, 7, figsize=(20,5), sharey = True)
for i in range(7):
ax[i].hist(MemberClusters[i]['Age'], edgecolor='w', bins=np.arange(0,100,10), density=True)
ax[i].axvline(x=MemberClusters[i]['Age'].median(), color='r', label='median')
ax[i].set_title(f"cluster{i}")
ax[i].set_xticks(range(0,100,10))
ax[i].legend()
ax[i].set(xlabel='Age', ylabel='% / 10 of members')
ax[i].label_outer() # hide the inner overlapping ticks & labels
fig.suptitle('Cluster6 Has a Younger Customer Base (30-40)', fontsize=20);
# Cluster 6 vs. population
plt.figure(figsize=(5, 5))
plt.hist(MemberClusters[6]['Age'], edgecolor='w', bins=np.arange(0,100,10), density=True, label='cluster 6', alpha=.7, color='firebrick')
plt.hist(membership_clustered['Age'], edgecolor='w', bins=np.arange(0,100,10), density=True, label='Population', alpha=.5, color='steelblue')
txt = '''Cluster 6 has more members aged btw
30-40 & 60-70 than population'''
plt.figtext(.5, .9, txt, ha='center', fontsize=15)
plt.xticks(range(0,100,10))
plt.xlabel('Age')
plt.ylabel('% / 10 of members')
plt.legend();
Cluster 6 has higher % of female, while other clusters being almost the same
# Plot the gender ratio by cluster
fig, ax = plt.subplots(1, 7, figsize=(10,5), sharey=True)
for i in range(7):
genderRatio = 100 * MemberClusters[i]['Gender'].value_counts() / MemberClusters[i]['Gender'].value_counts().sum()
colorlist = ['firebrick' if sex == 'F' else 'steelblue' for sex in genderRatio.index]
ax[i].bar(genderRatio.index, genderRatio, color=colorlist)
ax[i].set_title(f"cluster{i}")
ax[i].set(ylabel='Proportion (%)')
ax[i].label_outer() # hide the inner overlapping ticks & labels
fig.suptitle('Cluster6 Has the Highest % of Female', fontsize=20);
# Zoom in - Cluster 6 vs. population
ratio6 = 100 * MemberClusters[6]['Gender'].value_counts() / MemberClusters[6]['Gender'].value_counts().sum()
ratioPop = 100 * membership_clustered['Gender'].value_counts() / membership_clustered['Gender'].value_counts().sum()
col = ['firebrick' if sex == 'F' else 'steelblue' for sex in genderRatio.index]
plt.figure(figsize=(2.5, 5))
plt.bar(ratio6.index, ratio6, color=col, alpha=.5, label='Cluster 6')
plt.bar(ratioPop.index, ratioPop, color=col, label='Population', width=.4)
plt.legend()
plt.ylabel('Proportion (%)', fontsize=12)
txt = '''Cluster 6 Has Higher %
of Female Than Population '''
plt.figtext(.5, .9, txt, ha='center', fontsize=15);
# Plot the top categories (in terms of $ spending) by cluster (wider transparent bar), compared with population (narrower solid bars)
categoryAmount = []
for i in range(7):
categoryAmount.append(
100 * TransClusters[i].groupby('Category').sum()['Amount'].sort_values(ascending=False).head(8) / TransClusters[i].groupby('Category').sum()['Amount'].sum()
)
# Count across all clusters (population)
categoryAmount.append(
100 * transactionWithCat.groupby('Category').sum()['Amount'].sort_values(ascending=False).head(8) / transactionWithCat.groupby('Category').sum()['Amount'].sum()
)
# Colormap
items = transactionWithCat.groupby('Category').sum()['Amount'].sort_values(ascending=False).head(8).index.values
colorDict = {items[i]: plt.cm.get_cmap('tab10')(i) for i in range(8)}
# Plot the distribution
txt = '''Cluster2 favors Fashion, Women's Wear & Restaurants
Cluster6 favors Sportswear, Leisure & Accessories'''
fig, ax = plt.subplots(2, 4, figsize=(20,10), sharey=True)
fig.suptitle(x=.5, y=1.0, t='Popular Categories for Each Cluster', fontsize=20)
fig.text(.5, .92, txt, ha='center', fontsize=15)
plt.subplots_adjust(wspace=0.3, hspace=0.7)
# Plot each cluster respectively
for i in range(8):
colorlist = [colorDict[cat]
if cat in colorDict.keys()
else 'firebrick'
for cat in categoryAmount[i].index]
axi = ax.ravel()[i]
# Plot Population in the background
axi.bar(categoryAmount[7].index, categoryAmount[7], color=colorDict.values(), label='Population', width=0.3)
# If not population, plot the cluster in lighter wider bars
if i != 7:
axi.set_title(f"cluster{i}", fontsize=20)
axi.bar(categoryAmount[i].index, categoryAmount[i], color=colorlist, alpha=.3, label=f"cluster{i}")
else:
axi.set_title("Population", fontsize=20)
# Labels
axi.tick_params(labelrotation=75, labelsize=15)
axi.set_ylabel('% of spending', fontsize=15)
axi.legend()
axi.grid();
# Zoom in - Plot individual cluaster against population
def plot_cat_individual(c, txt): # c: cluster index
plt.figure(figsize=(5, 3))
# Plot Population in the background
plt.bar(categoryAmount[7].index, categoryAmount[7], label='Population', width=0.3)
# plot the specific cluster
plt.bar(categoryAmount[c].index, categoryAmount[c], alpha=.3, label=f"cluster {c}", color='firebrick')
# Labels
plt.xticks(rotation=45)
plt.ylabel('% of spending', fontsize=15)
plt.legend()
plt.grid()
plt.figtext(.5, .95, txt, ha='center', fontsize=15);
txt6 = '''Compared with Population, Cluster 6 Spends More
on Sportswear, Leisurewear, Accessories & KIDS '''
txt2 = '''Compared with Population, Cluster 2 Spends More
on Fashion, Women's Wear & Restaurants '''
# Plot cluster 6
plot_cat_individual(6, txt6)
# Plot cluster 2
plot_cat_individual(2, txt2)
Reward campaign performs indifferently across all clusters. This might be because reward is not the driver for outlet shopping, but direct discounts!
# Plot the points collected by reward campaign type, by cluster
fig, ax = plt.subplots(1, 7, figsize=(20,5), sharey=True)
fig.suptitle('Reward Type Is Not What Differentiates Clusters', fontsize=20)
plt.subplots_adjust(wspace=0.3)
for i in range(7):
# Sum the amount of points earned per campaign type
camSum = RewardClusters[i][['Campaign_code', 'points']].groupby('Campaign_code').sum().loc[['Normal', 'Double', 'Triple'], :]
camRatio = 100 * camSum['points'] / camSum['points'].sum()
ax[i].bar(camRatio.index, camRatio, color = plt.cm.tab10.colors[:3])#, color=colorlist)
ax[i].set_ylabel('% of all reward points', fontsize=15)
ax[i].tick_params(labelrotation=45, labelsize=15)
ax[i].set_title(f"cluster{i}", fontsize=15)
ax[i].grid();
# Prepare the data we need - Regeistered_month
membership_clustered['Regeistered_month'] = membership_clustered['Regeistered_month'].dropna().astype(int)
# Calculate the new member by month by cluster
newMemberByMonth = membership_clustered.groupby(['Cluster_Spectral', 'Regeistered_month']).size().sort_index().reset_index().rename(columns={0: '# of new members'})
# Plot - cluster 2 & 6
fig = px.line(newMemberByMonth,
x='Regeistered_month',
y='# of new members',
color='Cluster_Spectral',
title='Most cluster 2 & 6 are acquired in November & September respectively',
width=600, height=400)
# Plot - other clusters
fig1 = px.area(newMemberByMonth,
x='Regeistered_month',
y='# of new members',
color='Cluster_Spectral',
title='Most infrequent members are acquired in Octobor',
width=600, height=400)
fig.show()
fig1.show()
Based on the previous analysis, the most valuable members (cluster 2) spend mostly on fashion, which is the biggest source of sales revenue found here. We can conclude that fashion is the most profitable category and worth greater investment
# Plot the percentage sales by category
salesByCat = transactionWithCat[['Category', 'Amount']].groupby('Category').sum().sort_values('Amount', ascending=False)
salesByCat['%'] = 100* salesByCat['Amount'] / salesByCat['Amount'].sum()
plt.bar(salesByCat.index, salesByCat['%'])
plt.xlabel('Category', fontsize=15)
plt.ylabel('% of sales', fontsize=15)
plt.title('Fashion is the biggest source of sales', fontsize=25)
plt.grid()
plt.xticks(rotation=90, fontsize=15);
Based on our analysis above, we found the preferences, shopping patterns & growth opportunities for each cluster of members
! jupyter nbconvert --to html --template full Outlet_Membership_Analysis_production.ipynb